2015 AAQoL: Machine Learning Approaches

Author

Miguel Fudolig, Luke Cho, Lawrence Kim, Boya Liu

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

# Load the 2015 AAQoL survey export. Every character column is converted to a
# factor, then the reference (first) level is set explicitly for the two
# English-proficiency items and for Ethnicity.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese")
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

There are 2,609 responses, some with missing data.

Machine Learning Classifications

We are going to analyze the prediction accuracy of different machine learning algorithms in predicting the source of health information, healthcare utilization outcomes, and insurance.

We will consider the following algorithms:

- Random Forest
- Logistic Regression

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 

Random Forest

Without Ethnicity

Training Set

# Modelling frame for the "Family" outcome: keep only the No/Yes answers, drop
# the now-unused factor levels, keep the outcome plus the candidate predictors,
# remove rows with any missing value, and shorten the multi-word column names.
rfdata <- qol |>
  filter(Family %in% c("No", "Yes")) |>
  mutate(Family = droplevels(Family)) |>
  select(
    Family, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Split by outcome class so each class is divided roughly 70/30 on its own.
pos <- rfdata |> filter(Family == "Yes")
neg <- rfdata |> filter(Family == "No")

# Each row is independently labelled 1 (train) or 2 (test); the positive class
# is drawn first, then the negative class.
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Random forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  Family ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = Family ~ Age + Gender + Religion + Income +      Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 42.96%
Confusion matrix:
     No Yes class.error
No  423 367   0.4645570
Yes 329 501   0.3963855
# Score the forest on the rows it was fitted to.
# NOTE(review): this in-sample accuracy is far higher than the OOB estimate
# printed by print(rf_wo); treat it as a resubstitution figure only.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Family"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  718  52
       Yes  72 778
                                          
               Accuracy : 0.9235          
                 95% CI : (0.9094, 0.9359)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.8467          
                                          
 Mcnemar's Test P-Value : 0.08796         
                                          
            Sensitivity : 0.9373          
            Specificity : 0.9089          
         Pos Pred Value : 0.9153          
         Neg Pred Value : 0.9325          
             Prevalence : 0.5123          
         Detection Rate : 0.4802          
   Detection Prevalence : 0.5247          
      Balanced Accuracy : 0.9231          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out performance of the forest fitted without Ethnicity.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Family"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  176 127
       Yes 178 224
                                          
               Accuracy : 0.5674          
                 95% CI : (0.5299, 0.6043)
    No Information Rate : 0.5021          
    P-Value [Acc > NIR] : 0.0002993       
                                          
                  Kappa : 0.1353          
                                          
 Mcnemar's Test P-Value : 0.0041966       
                                          
            Sensitivity : 0.6382          
            Specificity : 0.4972          
         Pos Pred Value : 0.5572          
         Neg Pred Value : 0.5809          
             Prevalence : 0.4979          
         Detection Rate : 0.3177          
   Detection Prevalence : 0.5702          
      Balanced Accuracy : 0.5677          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the second column of the probability
# matrix is the "Yes" class, which is the score passed to the ROC curve.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")
rocobj_wo <- pROC::roc(test[["Family"]], pred_noeth[, "Yes"])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# Fit the forest with Ethnicity added on the SAME stratified train/test split
# that was built for the "Without Ethnicity" model above. Drawing a fresh,
# unstratified split here would evaluate the two forests on different test
# rows, so the ROC/AUROC comparison below would reflect the split as well as
# the predictor set. The Health Professional and Health Insurance sections
# already reuse their split in this way.
set.seed(222) # keeps the forest itself reproducible

# `Family ~ .` uses every other column of `train`, i.e. the seven predictors
# of the previous model plus Ethnicity.
rf_w <- randomForest::randomForest(
  Family ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = Family ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 40.56%
Confusion matrix:
     No Yes class.error
No  444 351   0.4415094
Yes 306 519   0.3709091
# Score the with-Ethnicity forest on its own training rows.
# NOTE(review): in-sample accuracy is much higher than the OOB estimate from
# print(rf_w); read it as a resubstitution figure only.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Family"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  722  31
       Yes  73 794
                                          
               Accuracy : 0.9358          
                 95% CI : (0.9227, 0.9472)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8714          
                                          
 Mcnemar's Test P-Value : 5.81e-05        
                                          
            Sensitivity : 0.9624          
            Specificity : 0.9082          
         Pos Pred Value : 0.9158          
         Neg Pred Value : 0.9588          
             Prevalence : 0.5093          
         Detection Rate : 0.4901          
   Detection Prevalence : 0.5352          
      Balanced Accuracy : 0.9353          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Held-out performance of the forest fitted with Ethnicity.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Family"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  192 140
       Yes 157 216
                                          
               Accuracy : 0.5787          
                 95% CI : (0.5413, 0.6155)
    No Information Rate : 0.505           
    P-Value [Acc > NIR] : 5.061e-05       
                                          
                  Kappa : 0.157           
                                          
 Mcnemar's Test P-Value : 0.3532          
                                          
            Sensitivity : 0.6067          
            Specificity : 0.5501          
         Pos Pred Value : 0.5791          
         Neg Pred Value : 0.5783          
             Prevalence : 0.5050          
         Detection Rate : 0.3064          
   Detection Prevalence : 0.5291          
      Balanced Accuracy : 0.5784          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Test-set probability of the "Yes" class (second column of the probability
# matrix) feeds the ROC curve for the with-Ethnicity forest.
pred_eth <- predict(rf_w, newdata = test, type = "prob")
rocobj <- pROC::roc(test[["Family"]], pred_eth[, "Yes"])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6283
pROC::auc(rocobj_wo)
Area under the curve: 0.6175

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                    No        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     7.918278 10.8645493            15.652593         66.03128
Age          16.654016 15.7460769            26.458185        170.45644
Gender        2.073286 11.5973472            10.273489         25.95290
Religion      4.617839  8.2180796            10.526109         72.95519
Employment    5.919339  3.9165010             8.723458         21.20784
Income       12.917360  0.6092244            10.286733         99.88059
EnglishSpeak  9.143444  8.3621486            15.168532         50.20068
EnglishDiff  12.539277  4.4662574            12.990193         57.47472

Logistic Regression

No ethnicity

Training Set

# Logistic regression for Family on all predictors except Ethnicity. With a
# factor response, glm(family = binomial) treats the first level ("No") as
# failure, so the fitted values are probabilities of "Yes".
mod1 <- glm(
  Family ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and give the predictions the same levels as the reference.
# as.factor() on the raw labels would produce a one-level factor whenever all
# probabilities fall on one side of the cut-off, and caret::confusionMatrix()
# requires both factors to share their levels.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(train$Family)
)

caret::confusionMatrix(pred_noeth, train$Family, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  460 346
       Yes 335 479
                                          
               Accuracy : 0.5796          
                 95% CI : (0.5552, 0.6038)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : 7.809e-09       
                                          
                  Kappa : 0.1592          
                                          
 Mcnemar's Test P-Value : 0.7016          
                                          
            Sensitivity : 0.5806          
            Specificity : 0.5786          
         Pos Pred Value : 0.5885          
         Neg Pred Value : 0.5707          
             Prevalence : 0.5093          
         Detection Rate : 0.2957          
   Detection Prevalence : 0.5025          
      Balanced Accuracy : 0.5796          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out rows with the current logistic model `mod1`.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, pinning the factor levels to those of the reference so the
# confusion matrix stays well-defined even if every prediction lands in a
# single class (as.factor() would then return a one-level factor).
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(test$Family)
)

caret::confusionMatrix(pred_noeth, test$Family, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  194 156
       Yes 155 200
                                          
               Accuracy : 0.5589          
                 95% CI : (0.5213, 0.5959)
    No Information Rate : 0.505           
    P-Value [Acc > NIR] : 0.002342        
                                          
                  Kappa : 0.1177          
                                          
 Mcnemar's Test P-Value : 1.000000        
                                          
            Sensitivity : 0.5618          
            Specificity : 0.5559          
         Pos Pred Value : 0.5634          
         Neg Pred Value : 0.5543          
             Prevalence : 0.5050          
         Detection Rate : 0.2837          
   Detection Prevalence : 0.5035          
      Balanced Accuracy : 0.5588          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$Family,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.5862

With ethnicity

Training Set

# Logistic regression for Family with Ethnicity added to the predictor set.
# With a factor response, glm(family = binomial) treats the first level ("No")
# as failure, so the fitted values are probabilities of "Yes".
mod1 <- glm(
  Family ~ Age + Ethnicity + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 with the reference's levels; as.factor() alone would yield a
# one-level factor if all predictions agreed, which caret::confusionMatrix()
# rejects. The name `pred_noeth` is kept from the previous chunk even though
# this model includes Ethnicity.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(train$Family)
)

caret::confusionMatrix(pred_noeth, train$Family, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  488 308
       Yes 307 517
                                          
               Accuracy : 0.6204          
                 95% CI : (0.5962, 0.6441)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2405          
                                          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.6267          
            Specificity : 0.6138          
         Pos Pred Value : 0.6274          
         Neg Pred Value : 0.6131          
             Prevalence : 0.5093          
         Detection Rate : 0.3191          
   Detection Prevalence : 0.5086          
      Balanced Accuracy : 0.6203          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out rows with the current logistic model `mod1`.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, pinning the factor levels to those of the reference so the
# confusion matrix stays well-defined even if every prediction lands in a
# single class (as.factor() would then return a one-level factor).
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(test$Family)
)

caret::confusionMatrix(pred_noeth, test$Family, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  211 145
       Yes 138 211
                                         
               Accuracy : 0.5986         
                 95% CI : (0.5613, 0.635)
    No Information Rate : 0.505          
    P-Value [Acc > NIR] : 3.696e-07      
                                         
                  Kappa : 0.1972         
                                         
 Mcnemar's Test P-Value : 0.7213         
                                         
            Sensitivity : 0.5927         
            Specificity : 0.6046         
         Pos Pred Value : 0.6046         
         Neg Pred Value : 0.5927         
             Prevalence : 0.5050         
         Detection Rate : 0.2993         
   Detection Prevalence : 0.4950         
      Balanced Accuracy : 0.5986         
                                         
       'Positive' Class : Yes            
                                         

ROC

rocobj_w <-pROC::roc(test$Family,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6195

Source of Information: Health Professional

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728

Random Forest

Without Ethnicity

Training Set

# Modelling frame for the "Heal Professionals" outcome: outcome plus candidate
# predictors, rows with any missing value removed, long column names shortened.
rfdata <- qol |>
  select(
    `Heal Professionals`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Split by outcome class so each class is divided roughly 70/30 on its own.
pos <- rfdata |> filter(`Heal Professionals` == "Yes")
neg <- rfdata |> filter(`Heal Professionals` == "No")

# Each row is independently labelled 1 (train) or 2 (test); the positive class
# is drawn first, then the negative class.
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Random forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  `Heal Professionals` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = `Heal Professionals` ~ Age + Gender +      Religion + Income + Employment + EnglishSpeak + EnglishDiff,      data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 41.23%
Confusion matrix:
     No Yes class.error
No  428 362   0.4582278
Yes 306 524   0.3686747
# Score the forest on the rows it was fitted to.
# NOTE(review): in-sample accuracy is much higher than the OOB estimate from
# print(rf_wo); read it as a resubstitution figure only.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Heal Professionals"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  677  46
       Yes 113 784
                                          
               Accuracy : 0.9019          
                 95% CI : (0.8863, 0.9159)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8032          
                                          
 Mcnemar's Test P-Value : 1.658e-07       
                                          
            Sensitivity : 0.9446          
            Specificity : 0.8570          
         Pos Pred Value : 0.8740          
         Neg Pred Value : 0.9364          
             Prevalence : 0.5123          
         Detection Rate : 0.4840          
   Detection Prevalence : 0.5537          
      Balanced Accuracy : 0.9008          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out performance of the forest fitted without Ethnicity.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Heal Professionals"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  181 126
       Yes 174 225
                                          
               Accuracy : 0.5751          
                 95% CI : (0.5376, 0.6119)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 6.996e-05       
                                          
                  Kappa : 0.1508          
                                          
 Mcnemar's Test P-Value : 0.006657        
                                          
            Sensitivity : 0.6410          
            Specificity : 0.5099          
         Pos Pred Value : 0.5639          
         Neg Pred Value : 0.5896          
             Prevalence : 0.4972          
         Detection Rate : 0.3187          
   Detection Prevalence : 0.5652          
      Balanced Accuracy : 0.5754          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the second column of the probability
# matrix is the "Yes" class, which is the score passed to the ROC curve.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")
rocobj_wo <- pROC::roc(test[["Heal Professionals"]], pred_noeth[, "Yes"])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# No new frame or split is created here (an earlier independent re-split of
# `rfdata` is disabled): the forest below is fitted on the `train` object that
# already exists at this point, and `test` is likewise left unchanged.
#
# `Heal Professionals ~ .` uses every other column of `train` as a predictor,
# which includes Ethnicity.
rf_w <- randomForest::randomForest(
  `Heal Professionals` ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = `Heal Professionals` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 42.1%
Confusion matrix:
     No Yes class.error
No  409 381   0.4822785
Yes 301 529   0.3626506
# Score the with-Ethnicity forest on its own training rows.
# NOTE(review): in-sample accuracy is much higher than the OOB estimate from
# print(rf_w); read it as a resubstitution figure only.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Heal Professionals"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  704  43
       Yes  86 787
                                          
               Accuracy : 0.9204          
                 95% CI : (0.9061, 0.9331)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8404          
                                          
 Mcnemar's Test P-Value : 0.0002174       
                                          
            Sensitivity : 0.9482          
            Specificity : 0.8911          
         Pos Pred Value : 0.9015          
         Neg Pred Value : 0.9424          
             Prevalence : 0.5123          
         Detection Rate : 0.4858          
   Detection Prevalence : 0.5389          
      Balanced Accuracy : 0.9197          
                                          
       'Positive' Class : Yes             
                                          
# Held-out (test set) performance of the forest fitted with Ethnicity.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Heal Professionals"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  179 122
       Yes 176 229
                                          
               Accuracy : 0.5779          
                 95% CI : (0.5405, 0.6147)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 3.752e-05       
                                          
                  Kappa : 0.1565          
                                          
 Mcnemar's Test P-Value : 0.002139        
                                          
            Sensitivity : 0.6524          
            Specificity : 0.5042          
         Pos Pred Value : 0.5654          
         Neg Pred Value : 0.5947          
             Prevalence : 0.4972          
         Detection Rate : 0.3244          
   Detection Prevalence : 0.5737          
      Balanced Accuracy : 0.5783          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Test-set probability of the "Yes" class (second column of the probability
# matrix) feeds the ROC curve for the with-Ethnicity forest.
pred_eth <- predict(rf_w, newdata = test, type = "prob")
rocobj <- pROC::roc(test[["Heal Professionals"]], pred_eth[, "Yes"])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6269
pROC::auc(rocobj_wo)
Area under the curve: 0.6207

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     No         Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     5.9860318  9.60330404            13.049013         68.63717
Age           7.0329957  4.75500036            10.080950        154.94856
Gender        5.3711218 -2.95635281             2.138476         25.31325
Religion      9.7787112 -0.02549784             7.782594         72.44490
Employment   -1.5342540 -1.77234860            -2.779661         20.60935
Income       -0.7468972 15.76511423            11.391666        100.79240
EnglishSpeak  7.2331614 19.06777183            22.152922         53.80531
EnglishDiff   3.7127127  9.91157749            10.384253         58.43659

Logistic Regression

No ethnicity

Training Set

# Logistic regression for `Heal Professionals` on all predictors except
# Ethnicity. With a factor response, glm(family = binomial) treats the first
# level ("No") as failure, so the fitted values are probabilities of "Yes".
mod1 <- glm(
  `Heal Professionals` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and give the predictions the same levels as the reference.
# as.factor() on the raw labels would produce a one-level factor whenever all
# probabilities fall on one side of the cut-off, and caret::confusionMatrix()
# requires both factors to share their levels.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(train$`Heal Professionals`)
)

caret::confusionMatrix(pred_noeth, train$`Heal Professionals`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  484 289
       Yes 306 541
                                          
               Accuracy : 0.6327          
                 95% CI : (0.6087, 0.6562)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2646          
                                          
 Mcnemar's Test P-Value : 0.5119          
                                          
            Sensitivity : 0.6518          
            Specificity : 0.6127          
         Pos Pred Value : 0.6387          
         Neg Pred Value : 0.6261          
             Prevalence : 0.5123          
         Detection Rate : 0.3340          
   Detection Prevalence : 0.5228          
      Balanced Accuracy : 0.6322          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out rows with the current logistic model `mod1`.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, pinning the factor levels to those of the reference so the
# confusion matrix stays well-defined even if every prediction lands in a
# single class (as.factor() would then return a one-level factor).
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(test$`Heal Professionals`)
)

caret::confusionMatrix(pred_noeth, test$`Heal Professionals`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  198 123
       Yes 157 228
                                          
               Accuracy : 0.6034          
                 95% CI : (0.5662, 0.6397)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 5.022e-08       
                                          
                  Kappa : 0.2072          
                                          
 Mcnemar's Test P-Value : 0.0486          
                                          
            Sensitivity : 0.6496          
            Specificity : 0.5577          
         Pos Pred Value : 0.5922          
         Neg Pred Value : 0.6168          
             Prevalence : 0.4972          
         Detection Rate : 0.3229          
   Detection Prevalence : 0.5453          
      Balanced Accuracy : 0.6037          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Heal Professionals`,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6429

With ethnicity

Training Set

# Logistic regression for `Heal Professionals` with Ethnicity added to the
# predictor set. With a factor response, glm(family = binomial) treats the
# first level ("No") as failure, so the fitted values are probabilities of
# "Yes".
mod1 <- glm(
  `Heal Professionals` ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 with the reference's levels; as.factor() alone would yield a
# one-level factor if all predictions agreed, which caret::confusionMatrix()
# rejects. The name `pred_noeth` is kept from the previous chunk even though
# this model includes Ethnicity.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(train$`Heal Professionals`)
)

caret::confusionMatrix(pred_noeth, train$`Heal Professionals`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  485 294
       Yes 305 536
                                          
               Accuracy : 0.6302          
                 95% CI : (0.6062, 0.6538)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2598          
                                          
 Mcnemar's Test P-Value : 0.6828          
                                          
            Sensitivity : 0.6458          
            Specificity : 0.6139          
         Pos Pred Value : 0.6373          
         Neg Pred Value : 0.6226          
             Prevalence : 0.5123          
         Detection Rate : 0.3309          
   Detection Prevalence : 0.5191          
      Balanced Accuracy : 0.6299          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out rows with the current logistic model `mod1`.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, pinning the factor levels to those of the reference so the
# confusion matrix stays well-defined even if every prediction lands in a
# single class (as.factor() would then return a one-level factor).
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "No"),
  levels = levels(test$`Heal Professionals`)
)

caret::confusionMatrix(pred_noeth, test$`Heal Professionals`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  203 130
       Yes 152 221
                                          
               Accuracy : 0.6006          
                 95% CI : (0.5634, 0.6369)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 1.147e-07       
                                          
                  Kappa : 0.2014          
                                          
 Mcnemar's Test P-Value : 0.2111          
                                          
            Sensitivity : 0.6296          
            Specificity : 0.5718          
         Pos Pred Value : 0.5925          
         Neg Pred Value : 0.6096          
             Prevalence : 0.4972          
         Detection Rate : 0.3130          
   Detection Prevalence : 0.5283          
      Balanced Accuracy : 0.6007          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Heal Professionals`,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6418

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest

Without Ethnicity

Training Set

# Modelling frame for the "Health Insurance" outcome: outcome plus candidate
# predictors, rows with any missing value removed, long column names shortened.
rfdata <- qol |>
  select(
    `Health Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Split by outcome class. The two levels of this factor are "0" and "Yes";
# comparing a factor with the number 0 matches the level labelled "0", which
# is written out as a string here.
pos <- rfdata |> filter(`Health Insurance` == "Yes")
neg <- rfdata |> filter(`Health Insurance` == "0")

# Each row is independently labelled 1 (train) or 2 (test); the positive class
# is drawn first, then the negative class.
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Random forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  `Health Insurance` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = `Health Insurance` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 13.58%
Confusion matrix:
     0  Yes class.error
0   16  203  0.92694064
Yes 17 1384  0.01213419
# Score the forest on the rows it was fitted to.
# NOTE(review): in-sample figures are much better than the OOB confusion
# matrix from print(rf_wo); read them as resubstitution figures only.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Health Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    132    2
       Yes   87 1399
                                          
               Accuracy : 0.9451          
                 95% CI : (0.9328, 0.9557)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.719           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9986          
            Specificity : 0.6027          
         Pos Pred Value : 0.9415          
         Neg Pred Value : 0.9851          
             Prevalence : 0.8648          
         Detection Rate : 0.8636          
   Detection Prevalence : 0.9173          
      Balanced Accuracy : 0.8007          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out evaluation of the no-ethnicity forest on the test partition.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Health Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0     7  16
       Yes  92 591
                                          
               Accuracy : 0.847           
                 95% CI : (0.8183, 0.8728)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.8483          
                                          
                  Kappa : 0.0653          
                                          
 Mcnemar's Test P-Value : 5.319e-13       
                                          
            Sensitivity : 0.97364         
            Specificity : 0.07071         
         Pos Pred Value : 0.86530         
         Neg Pred Value : 0.30435         
             Prevalence : 0.85977         
         Detection Rate : 0.83711         
   Detection Prevalence : 0.96742         
      Balanced Accuracy : 0.52217         
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test set; the "Yes" column is the ROC score.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")
# Pin the control/case levels and the direction instead of letting pROC
# choose them per curve, so every curve compared later uses one convention
# (controls = "0", cases = "Yes", higher score means more likely "Yes").
rocobj_wo <- pROC::roc(
  test$`Health Insurance`,
  pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

# Forest on every column of `train` (`~ .`), so Ethnicity is included. The
# train/test partition built for the no-ethnicity model is reused unchanged.
rf_w <- randomForest::randomForest(
  `Health Insurance` ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = `Health Insurance` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 13.46%
Confusion matrix:
     0  Yes class.error
0   18  201  0.91780822
Yes 17 1384  0.01213419
# In-sample (resubstitution) confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Health Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    156    2
       Yes   63 1399
                                          
               Accuracy : 0.9599          
                 95% CI : (0.9491, 0.9689)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8056          
                                          
 Mcnemar's Test P-Value : 9.911e-14       
                                          
            Sensitivity : 0.9986          
            Specificity : 0.7123          
         Pos Pred Value : 0.9569          
         Neg Pred Value : 0.9873          
             Prevalence : 0.8648          
         Detection Rate : 0.8636          
   Detection Prevalence : 0.9025          
      Balanced Accuracy : 0.8555          
                                          
       'Positive' Class : Yes             
                                          
# Test-set confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Health Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    10  12
       Yes  89 595
                                          
               Accuracy : 0.8569          
                 95% CI : (0.8289, 0.8819)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.6114          
                                          
                  Kappa : 0.1204          
                                          
 Mcnemar's Test P-Value : 3.961e-14       
                                          
            Sensitivity : 0.9802          
            Specificity : 0.1010          
         Pos Pred Value : 0.8699          
         Neg Pred Value : 0.4545          
             Prevalence : 0.8598          
         Detection Rate : 0.8428          
   Detection Prevalence : 0.9688          
      Balanced Accuracy : 0.5406          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Test-set class probabilities from the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = test, type = "prob")
# Levels and direction are fixed explicitly so this curve is directly
# comparable with the no-ethnicity curve it is plotted against.
rocobj <- pROC::roc(
  test$`Health Insurance`,
  pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6666
pROC::auc(rocobj_wo)
Area under the curve: 0.6534

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     2.400857 15.368673            16.107945         33.73518
Age           5.864815 11.132723            13.570007         72.90367
Gender       -6.129335  7.159769             3.938031         12.15898
Religion      3.565750 12.763937            13.211563         36.02166
Employment    4.416671 13.315128            14.557834         11.33611
Income       19.851050 11.218608            18.695617         61.88486
EnglishSpeak 21.295650 13.206903            20.516757         31.91132
EnglishDiff  -3.356981  9.806920             7.919856         26.51002

Logistic Regression

No ethnicity

Training Set

# Logistic regression for health-insurance status without Ethnicity.
# With a factor response, glm() treats the first level ("0") as failure, so
# the fitted values are probabilities of "Yes".
mod1 <- glm(
  `Health Insurance` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# In-sample predicted probabilities, classified at a 0.5 cut-off.
predicted_probs <- predict(mod1, newdata = train, type = "response")
# Set both levels explicitly so the factor always matches the reference,
# even if the model assigns every row to a single class.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_noeth, train$`Health Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0     17   14
       Yes  202 1387
                                          
               Accuracy : 0.8667          
                 95% CI : (0.8491, 0.8829)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : 0.4313          
                                          
                  Kappa : 0.106           
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.99001         
            Specificity : 0.07763         
         Pos Pred Value : 0.87288         
         Neg Pred Value : 0.54839         
             Prevalence : 0.86481         
         Detection Rate : 0.85617         
   Detection Prevalence : 0.98086         
      Balanced Accuracy : 0.53382         
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the no-ethnicity logistic model; these are
# also the scores used for the ROC curve that follows.
predicted_probs <- predict(mod1, newdata = test, type = "response")
# Set both levels explicitly so the factor always matches the reference,
# even if the model assigns every row to a single class.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_noeth, test$`Health Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    12   7
       Yes  87 600
                                         
               Accuracy : 0.8669         
                 95% CI : (0.8396, 0.891)
    No Information Rate : 0.8598         
    P-Value [Acc > NIR] : 0.3164         
                                         
                  Kappa : 0.1657         
                                         
 Mcnemar's Test P-Value : 3.693e-16      
                                         
            Sensitivity : 0.9885         
            Specificity : 0.1212         
         Pos Pred Value : 0.8734         
         Neg Pred Value : 0.6316         
             Prevalence : 0.8598         
         Detection Rate : 0.8499         
   Detection Prevalence : 0.9731         
      Balanced Accuracy : 0.5548         
                                         
       'Positive' Class : Yes            
                                         

ROC

# ROC for the no-ethnicity logistic model. Levels and direction are pinned
# (controls = "0", cases = "Yes", higher probability means "Yes") rather
# than auto-detected, so all curves in the report share one convention.
rocobj_wo <- pROC::roc(
  test$`Health Insurance`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6906

With ethnicity

Training Set

# Logistic regression for health-insurance status with Ethnicity added.
# `mod1` is deliberately overwritten; the test-set chunk below reads it.
mod1 <- glm(
  `Health Insurance` ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# In-sample predicted probabilities of "Yes", classified at 0.5.
predicted_probs <- predict(mod1, newdata = train, type = "response")
# Named pred_eth (not pred_noeth) because this model includes Ethnicity.
# Both levels are set explicitly so the factor always matches the reference.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_eth, train$`Health Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0     20   13
       Yes  199 1388
                                          
               Accuracy : 0.8691          
                 95% CI : (0.8517, 0.8852)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : 0.3208          
                                          
                  Kappa : 0.1279          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.99072         
            Specificity : 0.09132         
         Pos Pred Value : 0.87461         
         Neg Pred Value : 0.60606         
             Prevalence : 0.86481         
         Detection Rate : 0.85679         
   Detection Prevalence : 0.97963         
      Balanced Accuracy : 0.54102         
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the with-ethnicity logistic model; reused as
# the ROC scores below.
predicted_probs <- predict(mod1, newdata = test, type = "response")
# Named pred_eth (not pred_noeth) because this model includes Ethnicity.
# Both levels are set explicitly so the factor always matches the reference.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_eth, test$`Health Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0     9  10
       Yes  90 597
                                          
               Accuracy : 0.8584          
                 95% CI : (0.8304, 0.8832)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.5695          
                                          
                  Kappa : 0.1125          
                                          
 Mcnemar's Test P-Value : 2.789e-15       
                                          
            Sensitivity : 0.98353         
            Specificity : 0.09091         
         Pos Pred Value : 0.86900         
         Neg Pred Value : 0.47368         
             Prevalence : 0.85977         
         Detection Rate : 0.84561         
   Detection Prevalence : 0.97309         
      Balanced Accuracy : 0.53722         
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC for the with-ethnicity logistic model, with levels and direction
# pinned so it uses the same convention as the no-ethnicity curve.
rocobj_w <- pROC::roc(
  test$`Health Insurance`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6863

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest

Without Ethnicity

Training Set

# Modelling frame for dental insurance: outcome plus demographic and
# language predictors, complete cases only, with formula-friendly names.
rfdata <- qol |>
  select(
    `Dental Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Separate the outcome classes so each is partitioned on its own.
pos <- filter(rfdata, `Dental Insurance` == "Yes")
neg <- filter(rfdata, `Dental Insurance` == "0")

# Seeded for reproducibility. Each row is assigned to partition 1 (train)
# or 2 (test) with probability 0.7 / 0.3; positives are drawn first, then
# negatives, and that order must be kept to reproduce the same split.
set.seed(222)
split_prob <- c(0.7, 0.3)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = split_prob)
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = split_prob)

train <- bind_rows(filter(pos, ind_pos == 1), filter(neg, ind_neg == 1))
test <- bind_rows(filter(pos, ind_pos == 2), filter(neg, ind_neg == 2))

# Random forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  `Dental Insurance` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = `Dental Insurance` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 26.89%
Confusion matrix:
      0 Yes class.error
0   384 254   0.3981191
Yes 181 799   0.1846939
# Resubstitution check: score the same rows the forest was trained on.
# These in-sample figures are optimistic relative to the OOB estimate.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Dental Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   543  32
       Yes  95 948
                                          
               Accuracy : 0.9215          
                 95% CI : (0.9073, 0.9341)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8328          
                                          
 Mcnemar's Test P-Value : 3.763e-08       
                                          
            Sensitivity : 0.9673          
            Specificity : 0.8511          
         Pos Pred Value : 0.9089          
         Neg Pred Value : 0.9443          
             Prevalence : 0.6057          
         Detection Rate : 0.5859          
   Detection Prevalence : 0.6446          
      Balanced Accuracy : 0.9092          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out evaluation of the no-ethnicity forest on the test partition.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Dental Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   167  79
       Yes 110 347
                                          
               Accuracy : 0.7312          
                 95% CI : (0.6967, 0.7636)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : 2.214e-12       
                                          
                  Kappa : 0.4258          
                                          
 Mcnemar's Test P-Value : 0.0291          
                                          
            Sensitivity : 0.8146          
            Specificity : 0.6029          
         Pos Pred Value : 0.7593          
         Neg Pred Value : 0.6789          
             Prevalence : 0.6060          
         Detection Rate : 0.4936          
   Detection Prevalence : 0.6501          
      Balanced Accuracy : 0.7087          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test set; the "Yes" column is the ROC score.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")
# Pin the control/case levels and the direction instead of letting pROC
# choose them per curve, so every curve compared later uses one convention.
rocobj_wo <- pROC::roc(
  test$`Dental Insurance`,
  pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

# Forest on every column of `train` (`~ .`), so Ethnicity is included. The
# train/test partition built for the no-ethnicity model is reused unchanged.
rf_w <- randomForest::randomForest(
  `Dental Insurance` ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = `Dental Insurance` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 26.14%
Confusion matrix:
      0 Yes class.error
0   387 251   0.3934169
Yes 172 808   0.1755102
# In-sample (resubstitution) confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Dental Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   567  19
       Yes  71 961
                                         
               Accuracy : 0.9444         
                 95% CI : (0.9321, 0.955)
    No Information Rate : 0.6057         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8819         
                                         
 Mcnemar's Test P-Value : 7.621e-08      
                                         
            Sensitivity : 0.9806         
            Specificity : 0.8887         
         Pos Pred Value : 0.9312         
         Neg Pred Value : 0.9676         
             Prevalence : 0.6057         
         Detection Rate : 0.5939         
   Detection Prevalence : 0.6378         
      Balanced Accuracy : 0.9347         
                                         
       'Positive' Class : Yes            
                                         
# Test-set confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Dental Insurance"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   168  76
       Yes 109 350
                                          
               Accuracy : 0.7368          
                 95% CI : (0.7026, 0.7691)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : 2.077e-13       
                                          
                  Kappa : 0.4372          
                                          
 Mcnemar's Test P-Value : 0.01864         
                                          
            Sensitivity : 0.8216          
            Specificity : 0.6065          
         Pos Pred Value : 0.7625          
         Neg Pred Value : 0.6885          
             Prevalence : 0.6060          
         Detection Rate : 0.4979          
   Detection Prevalence : 0.6529          
      Balanced Accuracy : 0.7140          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Test-set class probabilities from the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = test, type = "prob")
# Levels and direction are fixed explicitly so this curve is directly
# comparable with the no-ethnicity curve it is plotted against.
rocobj <- pROC::roc(
  test$`Dental Insurance`,
  pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.7998
pROC::auc(rocobj_wo)
Area under the curve: 0.7967

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     5.994484 17.831023            19.295433         63.13087
Age           7.250900 31.263847            31.737507        139.82902
Gender       -6.854679 12.278566             5.521210         22.30855
Religion     -2.362729 13.833300             9.637888         61.08582
Employment   13.831350 31.792709            37.500551         38.80522
Income       38.744005 46.182120            57.569929        159.48297
EnglishSpeak 21.815707 19.476879            32.219878         62.49798
EnglishDiff   3.099861  6.366257             7.458982         48.81062

Logistic Regression

No ethnicity

Training Set

# Logistic regression for dental-insurance status without Ethnicity.
# With a factor response, glm() treats the first level ("0") as failure, so
# the fitted values are probabilities of "Yes".
mod1 <- glm(
  `Dental Insurance` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# In-sample predicted probabilities, classified at a 0.5 cut-off.
predicted_probs <- predict(mod1, newdata = train, type = "response")
# Set both levels explicitly so the factor always matches the reference,
# even if the model assigns every row to a single class.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_noeth, train$`Dental Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   395 184
       Yes 243 796
                                          
               Accuracy : 0.7361          
                 95% CI : (0.7139, 0.7574)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4384          
                                          
 Mcnemar's Test P-Value : 0.005003        
                                          
            Sensitivity : 0.8122          
            Specificity : 0.6191          
         Pos Pred Value : 0.7661          
         Neg Pred Value : 0.6822          
             Prevalence : 0.6057          
         Detection Rate : 0.4920          
   Detection Prevalence : 0.6422          
      Balanced Accuracy : 0.7157          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the no-ethnicity logistic model; these are
# also the scores used for the ROC curve that follows.
predicted_probs <- predict(mod1, newdata = test, type = "response")
# Set both levels explicitly so the factor always matches the reference,
# even if the model assigns every row to a single class.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_noeth, test$`Dental Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   181  76
       Yes  96 350
                                          
               Accuracy : 0.7553          
                 95% CI : (0.7218, 0.7867)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.4811          
                                          
 Mcnemar's Test P-Value : 0.1474          
                                          
            Sensitivity : 0.8216          
            Specificity : 0.6534          
         Pos Pred Value : 0.7848          
         Neg Pred Value : 0.7043          
             Prevalence : 0.6060          
         Detection Rate : 0.4979          
   Detection Prevalence : 0.6344          
      Balanced Accuracy : 0.7375          
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC for the no-ethnicity logistic model. Levels and direction are pinned
# (controls = "0", cases = "Yes", higher probability means "Yes") rather
# than auto-detected, so all curves in the report share one convention.
rocobj_wo <- pROC::roc(
  test$`Dental Insurance`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.8144

With ethnicity

Training Set

# Logistic regression for dental-insurance status with Ethnicity added.
# `mod1` is deliberately overwritten; the test-set chunk below reads it.
mod1 <- glm(
  `Dental Insurance` ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# In-sample predicted probabilities of "Yes", classified at 0.5.
predicted_probs <- predict(mod1, newdata = train, type = "response")
# Named pred_eth (not pred_noeth) because this model includes Ethnicity.
# Both levels are set explicitly so the factor always matches the reference.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_eth, train$`Dental Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   398 175
       Yes 240 805
                                          
               Accuracy : 0.7435          
                 95% CI : (0.7215, 0.7646)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.4533          
                                          
 Mcnemar's Test P-Value : 0.00168         
                                          
            Sensitivity : 0.8214          
            Specificity : 0.6238          
         Pos Pred Value : 0.7703          
         Neg Pred Value : 0.6946          
             Prevalence : 0.6057          
         Detection Rate : 0.4975          
   Detection Prevalence : 0.6459          
      Balanced Accuracy : 0.7226          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the with-ethnicity logistic model; reused as
# the ROC scores below.
predicted_probs <- predict(mod1, newdata = test, type = "response")
# Named pred_eth (not pred_noeth) because this model includes Ethnicity.
# Both levels are set explicitly so the factor always matches the reference.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = c("0", "Yes")
)

caret::confusionMatrix(pred_eth, test$`Dental Insurance`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   184  74
       Yes  93 352
                                          
               Accuracy : 0.7624          
                 95% CI : (0.7292, 0.7935)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.4965          
                                          
 Mcnemar's Test P-Value : 0.1637          
                                          
            Sensitivity : 0.8263          
            Specificity : 0.6643          
         Pos Pred Value : 0.7910          
         Neg Pred Value : 0.7132          
             Prevalence : 0.6060          
         Detection Rate : 0.5007          
   Detection Prevalence : 0.6330          
      Balanced Accuracy : 0.7453          
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC for the with-ethnicity logistic model, with levels and direction
# pinned so it uses the same convention as the no-ethnicity curve.
rocobj_w <- pROC::roc(
  test$`Dental Insurance`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

AUROC

pROC::auc(rocobj_w)
Area under the curve: 0.815
pROC::auc(rocobj_wo)
Area under the curve: 0.8144

Urgent Care utilization in the past 12 months

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest

Without Ethnicity

Training Set

# Modelling frame for urgent-care use: outcome plus demographic and
# language predictors, complete cases only, with formula-friendly names.
rfdata <- qol |>
  select(
    Urgentcare, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Separate the outcome classes so each is partitioned on its own.
pos <- filter(rfdata, Urgentcare == "Yes")
neg <- filter(rfdata, Urgentcare == "0")

# Seeded for reproducibility. Each row is assigned to partition 1 (train)
# or 2 (test) with probability 0.7 / 0.3; positives are drawn first, then
# negatives, and that order must be kept to reproduce the same split.
set.seed(222)
split_prob <- c(0.7, 0.3)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = split_prob)
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = split_prob)

train <- bind_rows(filter(pos, ind_pos == 1), filter(neg, ind_neg == 1))
test <- bind_rows(filter(pos, ind_pos == 2), filter(neg, ind_neg == 2))

# Random forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  Urgentcare ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = Urgentcare ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 17.66%
Confusion matrix:
       0 Yes class.error
0   1306  14  0.01060606
Yes  268   9  0.96750903
# Resubstitution check: score the same rows the forest was trained on.
# These in-sample figures are optimistic relative to the OOB estimate.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Urgentcare"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  174
       Yes    0  103
                                          
               Accuracy : 0.891           
                 95% CI : (0.8747, 0.9059)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 3.815e-13       
                                          
                  Kappa : 0.4946          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.3718          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.8835          
             Prevalence : 0.1735          
         Detection Rate : 0.0645          
   Detection Prevalence : 0.0645          
      Balanced Accuracy : 0.6859          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out evaluation of the no-ethnicity forest on the test partition.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Urgentcare"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   578 117
       Yes   3   2
                                          
               Accuracy : 0.8286          
                 95% CI : (0.7986, 0.8558)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5642          
                                          
                  Kappa : 0.0188          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.016807        
            Specificity : 0.994836        
         Pos Pred Value : 0.400000        
         Neg Pred Value : 0.831655        
             Prevalence : 0.170000        
         Detection Rate : 0.002857        
   Detection Prevalence : 0.007143        
      Balanced Accuracy : 0.505822        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test set; the "Yes" column is the ROC score.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")
# Pin the control/case levels and the direction instead of letting pROC
# choose them per curve. For a weak classifier like this one the automatic
# choice can differ between curves, which makes them non-comparable.
rocobj_wo <- pROC::roc(
  test$`Urgentcare`,
  pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

# Forest on every column of `train` (`~ .`), so Ethnicity is included. The
# train/test partition built for the no-ethnicity model is reused unchanged.
rf_w <- randomForest::randomForest(
  Urgentcare ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = Urgentcare ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 17.66%
Confusion matrix:
       0 Yes class.error
0   1309  11 0.008333333
Yes  271   6 0.978339350
# In-sample (resubstitution) confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Urgentcare"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  146
       Yes    0  131
                                          
               Accuracy : 0.9086          
                 95% CI : (0.8934, 0.9223)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5973          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.47292         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.90041         
             Prevalence : 0.17345         
         Detection Rate : 0.08203         
   Detection Prevalence : 0.08203         
      Balanced Accuracy : 0.73646         
                                          
       'Positive' Class : Yes             
                                          
# Test-set confusion matrix for the with-ethnicity forest.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Urgentcare"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   577 117
       Yes   4   2
                                          
               Accuracy : 0.8271          
                 95% CI : (0.7971, 0.8544)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.6033          
                                          
                  Kappa : 0.0159          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.016807        
            Specificity : 0.993115        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.831412        
             Prevalence : 0.170000        
         Detection Rate : 0.002857        
   Detection Prevalence : 0.008571        
      Balanced Accuracy : 0.504961        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the "Yes" column is the score.
pred_eth <- predict(rf_w, newdata = test, type = "prob")

# Pin the class roles and the direction. Left on "auto", pROC picked
# "controls > cases" for this model, i.e. it scored the curve as if a LOWER
# P(Yes) pointed to a "Yes" case. With direction = "<" a higher P(Yes) always
# counts towards "Yes", and the AUC for this model becomes 1 minus the value
# obtained under ">"; re-render to refresh the printed figure.
rocobj <- pROC::roc(
  response = test$Urgentcare,
  predictor = pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(list(NoEthnicity = rocobj_wo, Ethnicity = rocobj))

AUROC

# Test-set AUC for the forest fitted with Ethnicity among the predictors.
pROC::auc(rocobj)
Area under the curve: 0.4858
# AUC of the curve plotted above under the "NoEthnicity" label.
pROC::auc(rocobj_wo)
Area under the curve: 0.5102

Variable Importance

# Plot the variable-importance measures of the forest that includes Ethnicity.
randomForest::varImpPlot(rf_w)

# Same measures as a table: per-class columns plus MeanDecreaseAccuracy and
# MeanDecreaseGini.
randomForest::importance(rf_w)
                     0        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    10.686415 -6.1763833            8.5003474         35.33908
Age           4.496937  7.7223094            7.6703428        100.89055
Gender       -0.119335  2.2184180            0.8534296         14.10049
Religion     12.831846 -2.7225625           12.0576909         36.31904
Employment    5.839689 -3.5613963            4.4406385         12.21524
Income        3.736587  2.4811992            4.5254163         53.60340
EnglishSpeak 11.614478  0.9775154           11.5592823         25.38791
EnglishDiff   9.580822 -2.3648761            8.3388440         30.57526

Urgent care IS NOT really predicted well. We might as well predict “0” (no urgent-care use) for everyone, since test-set accuracy is not significantly different from the no-information rate (NIR).

Logistic Regression

No ethnicity

Training Set

# Logistic regression for urgent-care use on the demographic and language
# predictors, with Ethnicity left out of the formula.
mod1 <- glm(
  Urgentcare ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities on the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and give the result the same levels as the reference.
# A bare as.factor() yields a one-level factor when no probability exceeds
# 0.5, which made caret warn that the levels differ and refactor the data.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$Urgentcare)
)

caret::confusionMatrix(pred_noeth, train$Urgentcare, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  277
       Yes    0    0
                                          
               Accuracy : 0.8265          
                 95% CI : (0.8071, 0.8448)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 0.516           
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.0000          
            Specificity : 1.0000          
         Pos Pred Value :    NaN          
         Neg Pred Value : 0.8265          
             Prevalence : 0.1735          
         Detection Rate : 0.0000          
   Detection Prevalence : 0.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Predicted probabilities on the held-out rows (reused by the ROC chunk).
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, carrying the reference's levels so a single-class
# prediction vector does not trigger caret's level-mismatch warning.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$Urgentcare)
)

caret::confusionMatrix(pred_noeth, test$Urgentcare, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   581 119
       Yes   0   0
                                          
               Accuracy : 0.83            
                 95% CI : (0.8001, 0.8571)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5245          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.00            
            Specificity : 1.00            
         Pos Pred Value :  NaN            
         Neg Pred Value : 0.83            
             Prevalence : 0.17            
         Detection Rate : 0.00            
   Detection Prevalence : 0.00            
      Balanced Accuracy : 0.50            
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC of the no-ethnicity logistic model on the test rows. Levels and
# direction are pinned (to the values pROC auto-detected here) so that a
# higher predicted probability always counts towards "Yes".
rocobj_wo <- pROC::roc(
  response = test$Urgentcare,
  predictor = predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.5547

With ethnicity

Training Set

# Logistic regression for urgent-care use, now with Ethnicity added to the
# demographic and language predictors. Overwrites the previous `mod1`.
mod1 <- glm(
  Urgentcare ~ Age + Ethnicity + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities on the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 with the reference's levels; a bare as.factor() collapses
# to one level when nothing exceeds 0.5 and makes caret warn and refactor.
# Named pred_eth because this model includes Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$Urgentcare)
)

caret::confusionMatrix(pred_eth, train$Urgentcare, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  277
       Yes    0    0
                                          
               Accuracy : 0.8265          
                 95% CI : (0.8071, 0.8448)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 0.516           
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.0000          
            Specificity : 1.0000          
         Pos Pred Value :    NaN          
         Neg Pred Value : 0.8265          
             Prevalence : 0.1735          
         Detection Rate : 0.0000          
   Detection Prevalence : 0.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Predicted probabilities on the held-out rows (reused by the ROC chunk).
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5, carrying the reference's levels so a single-class
# prediction vector does not trigger caret's level-mismatch warning.
# Named pred_eth because the model in `mod1` includes Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$Urgentcare)
)

caret::confusionMatrix(pred_eth, test$Urgentcare, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   581 119
       Yes   0   0
                                          
               Accuracy : 0.83            
                 95% CI : (0.8001, 0.8571)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5245          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.00            
            Specificity : 1.00            
         Pos Pred Value :  NaN            
         Neg Pred Value : 0.83            
             Prevalence : 0.17            
         Detection Rate : 0.00            
   Detection Prevalence : 0.00            
      Balanced Accuracy : 0.50            
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC of the with-ethnicity logistic model on the test rows. Levels and
# direction are pinned (to the values pROC auto-detected here) so that a
# higher predicted probability always counts towards "Yes".
rocobj_w <- pROC::roc(
  response = test$Urgentcare,
  predictor = predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.5521

Physical Checkup

# Tabulate the outcome with the ps() helper (defined outside this section);
# the output lists n and pct for each level, including missing values.
ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest

Without Ethnicity

Training Set

# Modelling frame for the physical check-up outcome: keep the outcome and the
# eight candidate predictors, drop incomplete rows, and give the multi-word
# columns syntactic names.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Partition each outcome class separately.
pos <- filter(rfdata, `Physical Check-up` == "Yes")
neg <- filter(rfdata, `Physical Check-up` == "0")

# Label every row 1 (train, p = 0.7) or 2 (test, p = 0.3). The positive class
# is drawn first so the random stream after set.seed() stays the same.
set.seed(222)
ind_pos <- sample(2, size = nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, size = nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  `Physical Check-up` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = `Physical Check-up` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 30.7%
Confusion matrix:
      0 Yes class.error
0   154 365   0.7032755
Yes 129 961   0.1183486
# In-sample check for the no-ethnicity forest.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Physical Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    385    5
       Yes  134 1085
                                          
               Accuracy : 0.9136          
                 95% CI : (0.8988, 0.9269)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.7886          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9954          
            Specificity : 0.7418          
         Pos Pred Value : 0.8901          
         Neg Pred Value : 0.9872          
             Prevalence : 0.6774          
         Detection Rate : 0.6743          
   Detection Prevalence : 0.7576          
      Balanced Accuracy : 0.8686          
                                          
       'Positive' Class : Yes             
                                          

Test set

# Held-out check for the no-ethnicity forest.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Physical Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    60  67
       Yes 170 406
                                          
               Accuracy : 0.6629          
                 95% CI : (0.6266, 0.6978)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.7277          
                                          
                  Kappa : 0.1347          
                                          
 Mcnemar's Test P-Value : 3.458e-11       
                                          
            Sensitivity : 0.8584          
            Specificity : 0.2609          
         Pos Pred Value : 0.7049          
         Neg Pred Value : 0.4724          
             Prevalence : 0.6728          
         Detection Rate : 0.5775          
   Detection Prevalence : 0.8193          
      Balanced Accuracy : 0.5596          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the "Yes" column is the score.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")

# Pin levels and direction (to the values pROC auto-detected here) so the
# curve cannot silently flip on another split or seed.
rocobj_wo <- pROC::roc(
  response = test$`Physical Check-up`,
  predictor = pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# An earlier version rebuilt `rfdata` here and drew a fresh, unstratified
# 70/30 split (seed 222). That code is disabled, so this forest is fitted on
# the same `train` / `test` objects as the no-ethnicity forest.

# `~ .` uses every remaining column of `train`, Ethnicity included.
rf_w <- randomForest::randomForest(
  `Physical Check-up` ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = `Physical Check-up` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 30.39%
Confusion matrix:
      0 Yes class.error
0   163 356   0.6859345
Yes 133 957   0.1220183
# In-sample check for the forest that includes Ethnicity.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Physical Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    409    4
       Yes  110 1086
                                          
               Accuracy : 0.9291          
                 95% CI : (0.9155, 0.9412)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8287          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9963          
            Specificity : 0.7881          
         Pos Pred Value : 0.9080          
         Neg Pred Value : 0.9903          
             Prevalence : 0.6774          
         Detection Rate : 0.6750          
   Detection Prevalence : 0.7433          
      Balanced Accuracy : 0.8922          
                                          
       'Positive' Class : Yes             
                                          
# Held-out check for the forest that includes Ethnicity.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Physical Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    61  65
       Yes 169 408
                                          
               Accuracy : 0.6671          
                 95% CI : (0.6309, 0.7019)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.6427          
                                          
                  Kappa : 0.1446          
                                          
 Mcnemar's Test P-Value : 1.658e-11       
                                          
            Sensitivity : 0.8626          
            Specificity : 0.2652          
         Pos Pred Value : 0.7071          
         Neg Pred Value : 0.4841          
             Prevalence : 0.6728          
         Detection Rate : 0.5804          
   Detection Prevalence : 0.8208          
      Balanced Accuracy : 0.5639          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the "Yes" column is the score.
pred_eth <- predict(rf_w, newdata = test, type = "prob")

# Pin levels and direction (to the values pROC auto-detected here) so this
# curve is always scored with a higher P(Yes) counting towards "Yes".
rocobj <- pROC::roc(
  response = test$`Physical Check-up`,
  predictor = pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(list(NoEthnicity = rocobj_wo, Ethnicity = rocobj))

AUROC

# Test-set AUC for the forest fitted with Ethnicity among the predictors.
pROC::auc(rocobj)
Area under the curve: 0.6691
# Test-set AUC for the forest fitted without Ethnicity.
pROC::auc(rocobj_wo)
Area under the curve: 0.647

Variable Importance

# Plot the variable-importance measures of the forest that includes Ethnicity.
randomForest::varImpPlot(rf_w)

# Same measures as a table: per-class columns plus MeanDecreaseAccuracy and
# MeanDecreaseGini.
randomForest::importance(rf_w)
                      0      Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    -2.1405730 12.25127            10.297182         58.32849
Age           5.5814758 28.76194            27.503520        148.18282
Gender       -1.4156418 12.00199             8.760002         23.49004
Religion     -7.6574489 14.90969             8.434901         63.27393
Employment   -8.4761204 19.80178            15.557200         20.80459
Income        1.7335994 26.88213            23.456748         96.62966
EnglishSpeak -3.6512705 17.67902            15.079959         43.68058
EnglishDiff   0.9252643 12.65443            11.215809         49.64512

On the test set, accuracy is not significantly different from the no-information rate, with or without ethnicity.

Logistic Regression

No ethnicity

Training Set

# Logistic regression for having a physical check-up on the demographic and
# language predictors, with Ethnicity left out of the formula.
mod1 <- glm(
  `Physical Check-up` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities on the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and reuse the reference's levels, so the factor stays
# two-level even if every row falls on the same side of the cut-off.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$`Physical Check-up`)
)

caret::confusionMatrix(pred_noeth, train$`Physical Check-up`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   183 110
       Yes 336 980
                                          
               Accuracy : 0.7228          
                 95% CI : (0.7002, 0.7446)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : 4.52e-05        
                                          
                  Kappa : 0.2841          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.8991          
            Specificity : 0.3526          
         Pos Pred Value : 0.7447          
         Neg Pred Value : 0.6246          
             Prevalence : 0.6774          
         Detection Rate : 0.6091          
   Detection Prevalence : 0.8179          
      Balanced Accuracy : 0.6258          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Predicted probabilities on the held-out rows (reused by the ROC chunk).
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5 and reuse the reference's levels, so the factor stays
# two-level even if every row falls on the same side of the cut-off.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$`Physical Check-up`)
)

caret::confusionMatrix(pred_noeth, test$`Physical Check-up`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    60  65
       Yes 170 408
                                          
               Accuracy : 0.6657          
                 95% CI : (0.6295, 0.7005)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.6721          
                                          
                  Kappa : 0.1398          
                                          
 Mcnemar's Test P-Value : 1.167e-11       
                                          
            Sensitivity : 0.8626          
            Specificity : 0.2609          
         Pos Pred Value : 0.7059          
         Neg Pred Value : 0.4800          
             Prevalence : 0.6728          
         Detection Rate : 0.5804          
   Detection Prevalence : 0.8222          
      Balanced Accuracy : 0.5617          
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC of the no-ethnicity logistic model on the test rows. Levels and
# direction are pinned (to the values pROC auto-detected here) so that a
# higher predicted probability always counts towards "Yes".
rocobj_wo <- pROC::roc(
  response = test$`Physical Check-up`,
  predictor = predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6736

With ethnicity

Training Set

# Logistic regression for having a physical check-up, now with Ethnicity
# added to the predictors. Overwrites the previous `mod1`.
mod1 <- glm(
  `Physical Check-up` ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities on the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and reuse the reference's levels, so the factor stays
# two-level even if every row falls on the same side of the cut-off.
# Named pred_eth because this model includes Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$`Physical Check-up`)
)

caret::confusionMatrix(pred_eth, train$`Physical Check-up`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   191 116
       Yes 328 974
                                          
               Accuracy : 0.7241          
                 95% CI : (0.7015, 0.7458)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : 2.852e-05       
                                          
                  Kappa : 0.2929          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.8936          
            Specificity : 0.3680          
         Pos Pred Value : 0.7481          
         Neg Pred Value : 0.6221          
             Prevalence : 0.6774          
         Detection Rate : 0.6053          
   Detection Prevalence : 0.8092          
      Balanced Accuracy : 0.6308          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Predicted probabilities on the held-out rows (reused by the ROC chunk).
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Classify at 0.5 and reuse the reference's levels, so the factor stays
# two-level even if every row falls on the same side of the cut-off.
# Named pred_eth because the model in `mod1` includes Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$`Physical Check-up`)
)

caret::confusionMatrix(pred_eth, test$`Physical Check-up`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    72  64
       Yes 158 409
                                          
               Accuracy : 0.6842          
                 95% CI : (0.6484, 0.7184)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.2743          
                                          
                  Kappa : 0.1986          
                                          
 Mcnemar's Test P-Value : 4.327e-10       
                                          
            Sensitivity : 0.8647          
            Specificity : 0.3130          
         Pos Pred Value : 0.7213          
         Neg Pred Value : 0.5294          
             Prevalence : 0.6728          
         Detection Rate : 0.5818          
   Detection Prevalence : 0.8065          
      Balanced Accuracy : 0.5889          
                                          
       'Positive' Class : Yes             
                                          

ROC

# ROC of the with-ethnicity logistic model on the test rows. Levels and
# direction are pinned (to the values pROC auto-detected here) so that a
# higher predicted probability always counts towards "Yes".
rocobj_w <- pROC::roc(
  response = test$`Physical Check-up`,
  predictor = predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_w)

AUROC

# Test-set AUC for the logistic model that includes Ethnicity.
pROC::auc(rocobj_w)
Area under the curve: 0.6884
# Test-set AUC for the logistic model without Ethnicity.
pROC::auc(rocobj_wo)
Area under the curve: 0.6736

Dental Checkup

# Tabulate the outcome with the ps() helper (defined outside this section);
# the output lists n and pct for each level, including missing values.
ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest

Without Ethnicity

Training Set

# Modelling frame for the dental check-up outcome: keep the outcome and the
# eight candidate predictors, drop incomplete rows, and give the multi-word
# columns syntactic names.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Partition each outcome class separately.
pos <- filter(rfdata, `Dentist Check-up` == "Yes")
neg <- filter(rfdata, `Dentist Check-up` == "0")

# Label every row 1 (train, p = 0.7) or 2 (test, p = 0.3). The positive class
# is drawn first so the random stream after set.seed() stays the same.
set.seed(222)
ind_pos <- sample(2, size = nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, size = nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Forest on every predictor except Ethnicity.
rf_wo <- randomForest::randomForest(
  `Dentist Check-up` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = `Dentist Check-up` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 32.75%
Confusion matrix:
      0 Yes class.error
0   352 311   0.4690799
Yes 214 726   0.2276596
# In-sample check for the no-ethnicity forest.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train[["Dentist Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   567  44
       Yes  96 896
                                         
               Accuracy : 0.9127         
                 95% CI : (0.8978, 0.926)
    No Information Rate : 0.5864         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8178         
                                         
 Mcnemar's Test P-Value : 1.63e-05       
                                         
            Sensitivity : 0.9532         
            Specificity : 0.8552         
         Pos Pred Value : 0.9032         
         Neg Pred Value : 0.9280         
             Prevalence : 0.5864         
         Detection Rate : 0.5590         
   Detection Prevalence : 0.6188         
      Balanced Accuracy : 0.9042         
                                         
       'Positive' Class : Yes            
                                         

Test set

# Held-out check for the no-ethnicity forest.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test[["Dentist Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   145  99
       Yes 149 309
                                          
               Accuracy : 0.6467          
                 95% CI : (0.6101, 0.6821)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 0.0002248       
                                          
                  Kappa : 0.2566          
                                          
 Mcnemar's Test P-Value : 0.0018614       
                                          
            Sensitivity : 0.7574          
            Specificity : 0.4932          
         Pos Pred Value : 0.6747          
         Neg Pred Value : 0.5943          
             Prevalence : 0.5812          
         Detection Rate : 0.4402          
   Detection Prevalence : 0.6524          
      Balanced Accuracy : 0.6253          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the "Yes" column is the score.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")

# Pin levels and direction (to the values pROC auto-detected here) so the
# curve cannot silently flip on another split or seed.
rocobj_wo <- pROC::roc(
  response = test$`Dentist Check-up`,
  predictor = pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# An earlier version rebuilt `rfdata` here and drew a fresh, unstratified
# 70/30 split (seed 222). That code is disabled, so this forest is fitted on
# the same `train` / `test` objects as the no-ethnicity forest.

# `~ .` uses every remaining column of `train`, Ethnicity included.
rf_w <- randomForest::randomForest(
  `Dentist Check-up` ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = `Dentist Check-up` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 32.81%
Confusion matrix:
      0 Yes class.error
0   354 309   0.4660633
Yes 217 723   0.2308511
# In-sample check for the forest that includes Ethnicity.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train[["Dentist Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   593  45
       Yes  70 895
                                          
               Accuracy : 0.9283          
                 95% CI : (0.9145, 0.9404)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.8513          
                                          
 Mcnemar's Test P-Value : 0.02522         
                                          
            Sensitivity : 0.9521          
            Specificity : 0.8944          
         Pos Pred Value : 0.9275          
         Neg Pred Value : 0.9295          
             Prevalence : 0.5864          
         Detection Rate : 0.5583          
   Detection Prevalence : 0.6020          
      Balanced Accuracy : 0.9233          
                                          
       'Positive' Class : Yes             
                                          
# Held-out check for the forest that includes Ethnicity.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test[["Dentist Check-up"]],
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   140  93
       Yes 154 315
                                          
               Accuracy : 0.6481          
                 95% CI : (0.6115, 0.6835)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 0.0001672       
                                          
                  Kappa : 0.2557          
                                          
 Mcnemar's Test P-Value : 0.0001347       
                                          
            Sensitivity : 0.7721          
            Specificity : 0.4762          
         Pos Pred Value : 0.6716          
         Neg Pred Value : 0.6009          
             Prevalence : 0.5812          
         Detection Rate : 0.4487          
   Detection Prevalence : 0.6681          
      Balanced Accuracy : 0.6241          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities on the test rows; the "Yes" column is the score.
pred_eth <- predict(rf_w, newdata = test, type = "prob")

# Pin levels and direction (to the values pROC auto-detected here) so this
# curve is always scored with a higher P(Yes) counting towards "Yes".
rocobj <- pROC::roc(
  response = test$`Dentist Check-up`,
  predictor = pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(list(NoEthnicity = rocobj_wo, Ethnicity = rocobj))

AUROC

# Test-set AUC for the forest fitted with Ethnicity among the predictors.
pROC::auc(rocobj)
Area under the curve: 0.6809
# Test-set AUC for the forest fitted without Ethnicity.
pROC::auc(rocobj_wo)
Area under the curve: 0.6656

Variable Importance

# Plot the variable-importance measures of the forest that includes Ethnicity.
randomForest::varImpPlot(rf_w)

# Same measures as a table: per-class columns plus MeanDecreaseAccuracy and
# MeanDecreaseGini.
randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     2.637328 19.611727            23.513297         65.33439
Age          10.744222 22.345648            27.273211        150.85344
Gender        9.136014  2.322054             8.988817         25.43963
Religion     -1.532119 23.866492            23.414587         73.33316
Employment    6.797482  8.650243            12.627362         22.84096
Income       17.154001 27.930689            33.181691        118.57301
EnglishSpeak 11.087684 20.980028            26.048237         57.89318
EnglishDiff   8.755849  7.419699            12.420752         56.88743

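OOB accuracy decreased slightly when ethnicity was added to the model (OOB error 32.75% → 32.81%), while test-set accuracy was essentially unchanged (0.6467 → 0.6481) and AUC increased (0.6656 → 0.6809).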

Logistic Regression

No ethnicity

Training Set

# Logistic regression for having a dental check-up on the demographic and
# language predictors, with Ethnicity left out of the formula.
mod1 <- glm(
  `Dentist Check-up` ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities on the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Classify at 0.5 and reuse the reference's levels, so the factor stays
# two-level even if every row falls on the same side of the cut-off.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$`Dentist Check-up`)
)

caret::confusionMatrix(pred_noeth, train$`Dentist Check-up`, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   363 179
       Yes 300 761
                                          
               Accuracy : 0.7012          
                 95% CI : (0.6781, 0.7235)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.367           
                                          
 Mcnemar's Test P-Value : 4.183e-08       
                                          
            Sensitivity : 0.8096          
            Specificity : 0.5475          
         Pos Pred Value : 0.7172          
         Neg Pred Value : 0.6697          
             Prevalence : 0.5864          
         Detection Rate : 0.4747          
   Detection Prevalence : 0.6619          
      Balanced Accuracy : 0.6785          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the logistic model without Ethnicity.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$`Dentist Check-up`)
)

caret::confusionMatrix(
  pred_noeth,
  test$`Dentist Check-up`,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   141  93
       Yes 153 315
                                         
               Accuracy : 0.6496         
                 95% CI : (0.613, 0.6849)
    No Information Rate : 0.5812         
    P-Value [Acc > NIR] : 0.0001236      
                                         
                  Kappa : 0.259          
                                         
 Mcnemar's Test P-Value : 0.0001688      
                                         
            Sensitivity : 0.7721         
            Specificity : 0.4796         
         Pos Pred Value : 0.6731         
         Neg Pred Value : 0.6026         
             Prevalence : 0.5812         
         Detection Rate : 0.4487         
   Detection Prevalence : 0.6667         
      Balanced Accuracy : 0.6258         
                                         
       'Positive' Class : Yes            
                                         

ROC

# Test-set ROC curve for the logistic model without Ethnicity.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj_wo <- pROC::roc(
  test$`Dentist Check-up`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6846

With ethnicity

Training Set

# Logistic regression for dentist check-up on the training split, now with
# Ethnicity among the predictors. Reuses the name mod1, replacing the
# model fitted without Ethnicity.
mod1 <- glm(
  `Dentist Check-up` ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities for the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
# Named pred_eth because these predictions come from the model with Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$`Dentist Check-up`)
)

caret::confusionMatrix(
  pred_eth,
  train$`Dentist Check-up`,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   370 191
       Yes 293 749
                                          
               Accuracy : 0.6981          
                 95% CI : (0.6749, 0.7205)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3631          
                                          
 Mcnemar's Test P-Value : 4.413e-06       
                                          
            Sensitivity : 0.7968          
            Specificity : 0.5581          
         Pos Pred Value : 0.7188          
         Neg Pred Value : 0.6595          
             Prevalence : 0.5864          
         Detection Rate : 0.4672          
   Detection Prevalence : 0.6500          
      Balanced Accuracy : 0.6774          
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Test-set probabilities from the logistic model that includes Ethnicity.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
# Named pred_eth because these predictions come from the model with Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$`Dentist Check-up`)
)

caret::confusionMatrix(
  pred_eth,
  test$`Dentist Check-up`,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   146  94
       Yes 148 314
                                          
               Accuracy : 0.6553          
                 95% CI : (0.6188, 0.6904)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 3.479e-05       
                                          
                  Kappa : 0.2732          
                                          
 Mcnemar's Test P-Value : 0.0006569       
                                          
            Sensitivity : 0.7696          
            Specificity : 0.4966          
         Pos Pred Value : 0.6797          
         Neg Pred Value : 0.6083          
             Prevalence : 0.5812          
         Detection Rate : 0.4473          
   Detection Prevalence : 0.6581          
      Balanced Accuracy : 0.6331          
                                          
       'Positive' Class : Yes             
                                          

ROC

# Test-set ROC curve for the logistic model that includes Ethnicity.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj_w <- pROC::roc(
  test$`Dentist Check-up`,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6887

Folk Medicine

# Tabulate the Folkmedicine responses (count and percent per level, including
# NA). ps() is a helper that is not defined in this section.
ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest

Without Ethnicity

Training Set

# Analysis data for the folk-medicine outcome: keep the outcome and the eight
# predictors, drop every row with a missing value, and shorten the column
# names that would otherwise need backticks.
rfdata <- qol |>
  select(
    Folkmedicine, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income, `English Speaking`, `English Difficulties`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  )

# Separate the two outcome classes so each is split on its own.
# Folkmedicine is a factor, so it is compared against the level label "0"
# instead of relying on coercion of the number 0.
pos <- rfdata |> filter(Folkmedicine == "Yes")
neg <- rfdata |> filter(Folkmedicine == "0")

# Each row is assigned to train (1) or test (2) with probability 0.7 / 0.3.
# The seed and the order of the two sample() calls determine the split.
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])
test <- bind_rows(pos[ind_pos == 2, ], neg[ind_neg == 2, ])

# Random forest without Ethnicity; importance = TRUE asks randomForest to
# assess predictor importance.
rf_wo <- randomForest::randomForest(
  Folkmedicine ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  importance = TRUE
)
print(rf_wo)

Call:
 randomForest(formula = Folkmedicine ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 14.05%
Confusion matrix:
       0 Yes class.error
0   1361  11 0.008017493
Yes  212   3 0.986046512
# Score the training rows with rf_wo and compare against the observed outcome.
# Passing newdata scores the rows directly, so this table differs from the
# out-of-bag confusion matrix printed with the model.
pred_noeth <- predict(rf_wo, newdata = train)
caret::confusionMatrix(
  data = pred_noeth,
  reference = train$Folkmedicine,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372  128
       Yes    0   87
                                          
               Accuracy : 0.9193          
                 95% CI : (0.9048, 0.9323)
    No Information Rate : 0.8645          
    P-Value [Acc > NIR] : 6.136e-12       
                                          
                  Kappa : 0.5403          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.40465         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.91467         
             Prevalence : 0.13548         
         Detection Rate : 0.05482         
   Detection Prevalence : 0.05482         
      Balanced Accuracy : 0.70233         
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out test rows with rf_wo and compare against the observed
# outcome.
pred_noeth <- predict(rf_wo, newdata = test)
caret::confusionMatrix(
  data = pred_noeth,
  reference = test$Folkmedicine,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   598  96
       Yes   2   1
                                          
               Accuracy : 0.8594          
                 95% CI : (0.8314, 0.8844)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.5702          
                                          
                  Kappa : 0.0117          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.996667        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.861671        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.004304        
      Balanced Accuracy : 0.503488        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities for the test rows from the forest without Ethnicity.
pred_noeth <- predict(rf_wo, newdata = test, type = "prob")

# Use the "Yes" column as the score, selected by name rather than position.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj_wo <- pROC::roc(
  test$Folkmedicine,
  pred_noeth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

# Random forest on every column of train, i.e. the rf_wo predictors plus
# Ethnicity.
rf_w <- randomForest::randomForest(
  Folkmedicine ~ .,
  data = train,
  importance = TRUE
)
print(rf_w)

Call:
 randomForest(formula = Folkmedicine ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 14.11%
Confusion matrix:
       0 Yes class.error
0   1360  12 0.008746356
Yes  212   3 0.986046512
# Score the training rows with rf_w and compare against the observed outcome.
# Passing newdata scores the rows directly, so this table differs from the
# out-of-bag confusion matrix printed with the model.
pred_eth <- predict(rf_w, newdata = train)
caret::confusionMatrix(
  data = pred_eth,
  reference = train$Folkmedicine,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372   93
       Yes    0  122
                                          
               Accuracy : 0.9414          
                 95% CI : (0.9287, 0.9524)
    No Information Rate : 0.8645          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.694           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.56744         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.93652         
             Prevalence : 0.13548         
         Detection Rate : 0.07687         
   Detection Prevalence : 0.07687         
      Balanced Accuracy : 0.78372         
                                          
       'Positive' Class : Yes             
                                          

Test Set

# Score the held-out test rows with rf_w and compare against the observed
# outcome.
pred_eth <- predict(rf_w, newdata = test)
caret::confusionMatrix(
  data = pred_eth,
  reference = test$Folkmedicine,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   595  96
       Yes   5   1
                                          
               Accuracy : 0.8551          
                 95% CI : (0.8267, 0.8804)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.6923          
                                          
                  Kappa : 0.0033          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.991667        
         Pos Pred Value : 0.166667        
         Neg Pred Value : 0.861071        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.008608        
      Balanced Accuracy : 0.500988        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

# Class probabilities for the test rows from the forest with Ethnicity.
pred_eth <- predict(rf_w, newdata = test, type = "prob")

# Use the "Yes" column as the score, selected by name rather than position.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj <- pROC::roc(
  test$Folkmedicine,
  pred_eth[, "Yes"],
  levels = c("0", "Yes"),
  direction = "<"
)

# Overlay the two random-forest ROC curves for comparison.
pROC::ggroc(list(NoEthnicity = rocobj_wo, Ethnicity = rocobj))

AUROC

# Test-set AUC for the two random forests: rocobj is the model with Ethnicity,
# rocobj_wo the model without it.
pROC::auc(rocobj)
Area under the curve: 0.6394
pROC::auc(rocobj_wo)
Area under the curve: 0.6139

Variable Importance

# Plot, then tabulate, the variable importance measures of the random forest
# that includes Ethnicity.
randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    12.726453  6.3822426            15.118640         32.25924
Age          12.115027 10.9895607            15.384077         81.72420
Gender        2.006013  3.8682783             3.404765         12.07102
Religion      3.071722  0.8760851             3.184835         30.79800
Employment   10.548735  1.1738049            10.962190         11.33764
Income        7.361344 -2.8518109             5.614595         46.51250
EnglishSpeak  9.355979 -1.1478322             8.759329         23.88430
EnglishDiff   4.538927  0.6254122             4.520140         26.85372

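Test-set accuracy slightly decreased when ethnicity is added to the model (0.8594 to 0.8551), but AUC increased (0.6139 to 0.6394). Both models fall below the no-information rate (0.8608) and identify only 1 of the 97 "Yes" cases in the test set, so accuracy alone is misleading for this imbalanced outcome.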

Logistic Regression

No ethnicity

Training Set

# Logistic regression for folk-medicine use on the training split, using every
# predictor except Ethnicity.
mod1 <- glm(
  Folkmedicine ~ Age + Gender + Religion + Income + Employment +
    EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities for the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Probabilities above 0.5 are labelled "Yes", everything else "0". No training
# row exceeds 0.5 here, so as.factor() would build a factor with the single
# level "0" and caret would warn that the levels do not match the reference.
# Pinning the levels keeps both classes present.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$Folkmedicine)
)

caret::confusionMatrix(pred_noeth, train$Folkmedicine, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372  215
       Yes    0    0
                                         
               Accuracy : 0.8645         
                 95% CI : (0.8467, 0.881)
    No Information Rate : 0.8645         
    P-Value [Acc > NIR] : 0.5182         
                                         
                  Kappa : 0              
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.0000         
            Specificity : 1.0000         
         Pos Pred Value :    NaN         
         Neg Pred Value : 0.8645         
             Prevalence : 0.1355         
         Detection Rate : 0.0000         
   Detection Prevalence : 0.0000         
      Balanced Accuracy : 0.5000         
                                         
       'Positive' Class : Yes            
                                         

Test Set

# Test-set probabilities from the logistic model without Ethnicity.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
pred_noeth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$Folkmedicine)
)

caret::confusionMatrix(pred_noeth, test$Folkmedicine, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   598  96
       Yes   2   1
                                          
               Accuracy : 0.8594          
                 95% CI : (0.8314, 0.8844)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.5702          
                                          
                  Kappa : 0.0117          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.996667        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.861671        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.004304        
      Balanced Accuracy : 0.503488        
                                          
       'Positive' Class : Yes             
                                          

ROC

# Test-set ROC curve for the logistic model without Ethnicity.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj_wo <- pROC::roc(
  test$Folkmedicine,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6245

With ethnicity

Training Set

# Logistic regression for folk-medicine use on the training split, now with
# Ethnicity among the predictors. Reuses the name mod1, replacing the
# model fitted without Ethnicity.
mod1 <- glm(
  Folkmedicine ~ Age + Ethnicity + Gender + Religion + Income +
    Employment + EnglishSpeak + EnglishDiff,
  data = train,
  family = binomial
)

# Fitted probabilities for the training rows.
predicted_probs <- predict(mod1, newdata = train, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
# Named pred_eth because these predictions come from the model with Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(train$Folkmedicine)
)

caret::confusionMatrix(pred_eth, train$Folkmedicine, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1369  212
       Yes    3    3
                                         
               Accuracy : 0.8645         
                 95% CI : (0.8467, 0.881)
    No Information Rate : 0.8645         
    P-Value [Acc > NIR] : 0.5182         
                                         
                  Kappa : 0.0199         
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.013953       
            Specificity : 0.997813       
         Pos Pred Value : 0.500000       
         Neg Pred Value : 0.865908       
             Prevalence : 0.135476       
         Detection Rate : 0.001890       
   Detection Prevalence : 0.003781       
      Balanced Accuracy : 0.505883       
                                         
       'Positive' Class : Yes            
                                         

Test Set

# Test-set probabilities from the logistic model that includes Ethnicity.
predicted_probs <- predict(mod1, newdata = test, type = "response")

# Probabilities above 0.5 are labelled "Yes". Levels are pinned to the
# reference so a class that is never predicted is not dropped from the factor.
# Named pred_eth because these predictions come from the model with Ethnicity.
pred_eth <- factor(
  ifelse(predicted_probs > 0.5, "Yes", "0"),
  levels = levels(test$Folkmedicine)
)

caret::confusionMatrix(pred_eth, test$Folkmedicine, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   596  97
       Yes   4   0
                                          
               Accuracy : 0.8551          
                 95% CI : (0.8267, 0.8804)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.6923          
                                          
                  Kappa : -0.0111         
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.000000        
            Specificity : 0.993333        
         Pos Pred Value : 0.000000        
         Neg Pred Value : 0.860029        
             Prevalence : 0.139168        
         Detection Rate : 0.000000        
   Detection Prevalence : 0.005739        
      Balanced Accuracy : 0.496667        
                                          
       'Positive' Class : Yes             
                                          

ROC

# Test-set ROC curve for the logistic model that includes Ethnicity.
# levels (control, case) and direction are stated explicitly so pROC does not
# infer them from the data; "<" means controls score lower than cases.
rocobj_w <- pROC::roc(
  test$Folkmedicine,
  predicted_probs,
  levels = c("0", "Yes"),
  direction = "<"
)
pROC::ggroc(rocobj_w)

AUROC

# Test-set AUC for the two logistic models: rocobj_w is the model with
# Ethnicity, rocobj_wo the model without it.
pROC::auc(rocobj_w)
Area under the curve: 0.6418
pROC::auc(rocobj_wo)
Area under the curve: 0.6245